Correlation of Country Goverment Index & GDP Captia

Author

Brian Kwong, Connor Gamba, Joey Bailitz, Arneh Begi

Import the Needed Libraries

Please make sure you have the following libraries installed:

  1. Tidyverse
  2. Here
  3. Janitor
  4. Styler
  5. Reactablefmtr
  6. gganimate
  7. gifski
Code
# Set output scope
options(scipen = 0)

# Clears environment
remove(list = as.vector(ls()))

#| output: False
library(tidyverse)
library(janitor)
library(styler)
library(reactablefmtr)
library(gganimate)
library(gifski)

# Formats documents:
style_file(path = here::here("Src", "Correlation_of_Country_Goverment_Index_&_GDP_Captia.qmd"), style = tidyverse_style, strict = TRUE)

Introduction

Background

The following data contains data collected on countries GDP and independent international calculation of a absence of corruption index from a variety of factors including government transparency, public official use of state dollars for personal gain and general trust in government officials being truthful in their actions between the years 1975 and 2021. Higher absence corruption index signals lower perceived corruption.

Our Data

Importing the Data

Code
curruption_data <- read_csv(here::here("Datasets", "abscorrup_idea.csv"), show_col_types = FALSE) |> janitor::clean_names()

gdp_per_cap_data <- read_csv(here::here("Datasets", "gdp_pcap.csv"), show_col_types = FALSE) |> janitor::clean_names()

Matching Our Daaset

Since our corruption only has data for years between [1975,2021] we filter our GDP per captia to those years too

Code
gdp_per_cap_data <- gdp_per_cap_data |> select(country, c(x1975:x2021))

Piviot Data to Long Format

Code
curruption_data <- curruption_data |>
  pivot_longer(cols = c(x1975:x2021), names_to = "Year", values_to = "Absence_Corruption_Index") |>
  mutate(Absence_Corruption_Index = as.numeric(Absence_Corruption_Index))

gdp_per_cap_data <- gdp_per_cap_data |>
  pivot_longer(cols = c(x1975:x2021), names_to = "Year", values_to = "GDP_Per_Capita") |>
  mutate(GDP_Per_Capita = if_else(str_detect(GDP_Per_Capita, "k$"), (as.numeric(str_remove_all(GDP_Per_Capita, "k$")) * 10000), as.numeric(GDP_Per_Capita)))

Data Joins

Code
full_data <- curruption_data |> inner_join(gdp_per_cap_data, by = c("country", "Year"))

full_data <- full_data |>
  mutate(country = as_factor(country)) |>
  mutate(region = as.factor(case_when(
    country %in% c(
      "Antigua and Barbuda", "Argentina", "Bahamas", "Barbados",
      "Belize", "Bolivia", "Brazil", "Canada", "Chile", "Colombia",
      "Costa Rica", "Cuba", "Dominica", "Dominican Republic",
      "Ecuador", "El Salvador", "Grenada", "Guatemala", "Guyana",
      "Haiti", "Honduras", "Jamaica", "Mexico", "Nicaragua", "Panama",
      "Paraguay", "Peru", "St. Kitts and Nevis", "St. Lucia",
      "St. Vincent and the Grenadines", "Suriname",
      "Trinidad and Tobago", "USA", "Uruguay",
      "Venezuela"
    ) ~ "The Americas",
    country %in% c(
      "Albania", "Andorra", "Armenia", "Austria", "Azerbaijan",
      "Belarus", "Belgium", "Bosnia and Herzegovina", "Bulgaria",
      "Croatia", "Cyprus", "Czech Republic", "Denmark", "Estonia",
      "Finland", "France", "Georgia", "Germany", "Greece", "Holy See",
      "Hungary", "Iceland", "Ireland", "Italy", "Latvia",
      "Liechtenstein", "Lithuania", "Luxembourg", "North Macedonia",
      "Malta", "Moldova", "Monaco", "Montenegro", "Netherlands",
      "Norway", "Poland", "Portugal", "Romania", "Russia", "San Marino",
      "Serbia", "Slovak Republic", "Slovenia", "Spain", "Sweden",
      "Switzerland", "Turkey", "Ukraine",
      "UK"
    ) ~ "Europe",
    country %in% c(
      "Algeria", "Angola", "Benin", "Botswana", "Burkina Faso",
      "Burundi", "Cameroon", "Cape Verde", "Central African Republic",
      "Chad", "Comoros", "Congo, Dem. Rep.", "Congo, Rep.",
      "Cote d'Ivoire", "Djibouti", "Egypt", "Equatorial Guinea",
      "Eritrea", "Ethiopia", "Gabon", "Gambia", "Ghana", "Guinea",
      "Guinea-Bissau", "Kenya", "Lesotho", "Liberia", "Libya",
      "Madagascar", "Malawi", "Mali", "Mauritania", "Mauritius",
      "Morocco", "Mozambique", "Namibia", "Niger", "Nigeria", "Rwanda",
      "Sao Tome and Principe", "Senegal", "Seychelles",
      "Sierra Leone", "Somalia", "South Africa", "Sudan", "Eswatini",
      "Tanzania", "Togo", "Tunisia", "Uganda", "Zambia", "Zimbabwe",
      "South Sudan"
    ) ~ "Africa",
    country %in% c(
      "Afghanistan", "Australia", "Bahrain", "Bangladesh", "Bhutan",
      "Brunei", "Cambodia", "China", "Fiji", "Hong Kong, China",
      "India", "Indonesia", "Iran", "Iraq", "Israel", "Japan", "Jordan",
      "Kazakhstan", "Kiribati", "North Korea", "South Korea", "Kuwait",
      "Kyrgyz Republic", "Lao", "Lebanon", "Malaysia", "Maldives",
      "Marshall Islands", "Micronesia, Fed. Sts.", "Mongolia",
      "Myanmar", "Nauru", "Nepal", "New Zealand", "Oman", "Pakistan",
      "Palau", "Papua New Guinea", "Philippines", "Qatar", "Samoa",
      "Saudi Arabia", "Singapore", "Solomon Islands", "Sri Lanka",
      "Syria", "Taiwan", "Tajikistan", "Thailand", "Timor-Leste",
      "Tonga", "Turkmenistan", "Tuvalu", "UAE",
      "Uzbekistan", "Vanuatu", "Palestine", "Vietnam",
      "Yemen"
    ) ~ "Asia-Pacific"
  )))

Understanding Our Data

The following ER diagram represents how our data is structured

Code
# ER Diagram
knitr::include_graphics(here::here("Figures", "ERDiagram.png"))

Variable Type Description
🔑Country String Full Name of a Country
🔑Year Int Year This data was collected
Country’s Absence of Corruption Index Optional(Double) A index measuring how corrupt a perceived country’s government is. Lower value means less trust and more corruption. NA means no data is unavailable for that year. Please check the section about NA values for some possible reasons
GDP Per Captia Double Calculated GDP per Captia (GDP/Population)

The two quantitative variables our team have decided to focus on are GDP per Capita (GDP/Population) and Absence of Corruption as an indexed range. These variables are measured for all 172 countries. Below are the descriptions of each and what they represent, specifically in our data set: GDP per Capita (GDP/Population): This variable represents a countries GDP or Gross Domestic Product for a given year divided by the countries population size. In other words, this variable is simply the per-person GDP for that (Ingabire 2020) country.Absence of Corruption Index: This variable represents the lack of corruption within a countries public administration.(Gapminder 2023) This indicator is determined by assessing public sector corrupt exchanges, public sector theft, executive embezzlement and theft, executive bribery and corrupt exchanges. These values are determined by assessing the extent that public officials within an office administration do not use their power for personal benefit or gain. The scaling of this index is such that a lower value represent a higher absence of corruption, or simply put less corruption. Conversely, a high value for this variable represents a lower absence of corruption, or simply put, more corruption. The categorical variable Country refers to the country that is being observed or referred to when assessing the corresponding values for GDP per Capita and Absence of Corruption. The quantitative variable Year, refers to the given year that corresponds to the other variables observations. Year refers to data collected from January to December for the given year.

Summary

Code
full_data |>
  select(country) |>
  summarise(Count = n_distinct(country))

There are a total of 172 countries in this study

Below is a quick summary of the average mean absense corruption Index and GDP per Captia out of every country for every year

Code
full_data |>
  group_by(country) |>
  mutate(across(.cols = c(Absence_Corruption_Index, GDP_Per_Capita), .fns = ~ mean(.x, na.rm = TRUE))) |>
  group_by(Year) |>
  summary() |>
  unclass() |>
  as_tibble() |>
  select(Absence_Corruption_Index, GDP_Per_Capita) |>
  drop_na() |>
  rename("Absence Corruption Index" = Absence_Corruption_Index, " GDP Per Capita" = GDP_Per_Capita)
Absence Corruption Index GDP Per Capita
Min. :11.59 Min. : 781.9
1st Qu.:32.40 1st Qu.: 3385.7
Median :42.69 Median : 40924.4
Mean :46.41 Mean :129519.2
3rd Qu.:57.52 3rd Qu.:198925.5
Max. :98.14 Max. :820276.6

About those Pesky NAs

One thing we do want to acknowledge is the presence of NAs in our datasheet particularly in the corruption index. There could be a multitude of reasons why this that data isn’t present for a country in a particular year: data was lost, data was measured/calculated incorrectly, some disaster prevent the data from being collected, insufficient stability for determination to occur or the government doesn’t want the public to know about their corruption therefore attempts to censor it. All these factors some, randomized some not will be taken into our consideration when determining the relationship between corruption index and GDP per captia

Possible Hypothesis

  • Explanatory Variable : Absence of Curruption of Index
  • Response Variable : GDP Per Captia

We believe that a higher absence of corruption index will lead to a higher GDP per captia. Traditionally are more stable and democratic forms of governments have higher standards of living,less internal/external conflicts, equal opportunities for all individuals and large amounts of well fares for their constituency. They also in general have stronger economies that are more reliant to wild economic swings For instance much of modern Europe has a high GDP per captia because of its modernization, democratic governments and tight integration with the EU.

Data Visualizations

Code
# Renames the year from xYYYY -> YYYY
full_data <- full_data |> mutate(Year = as.numeric(str_remove_all(Year, "^x")))

2021 Absence Of Corruption vs GDP per Capita

Below is a plot of countries and their respective Absence of Corruption Index vrs. GDP Per Captia in the year 2021

Code
full_data |>
  filter(Year == 2021) |>
  ggplot(mapping = aes(x = Absence_Corruption_Index, y = GDP_Per_Capita)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Absence of Corruption vs GDP per Capita 2021", x = "Absence of Corruption", y = "GDP per Capita") +
  theme(axis.title.y = element_text(vjust = 0.5, angle = 0), plot.title = element_text(hjust = 0.5))

Log Transformed 2021 Absence Of Corruption vs GDP per Capita

As many of our countries have quite poor GDP-per-Captia they are all bunched up on the bottom making it quite hard to visualize. We will therefore log transform our Y-axis to get a better view of this relationship

Code
# With Log Transformation
full_data |>
  filter(Year == 2021) |>
  mutate(GDP_Per_Capita = log(GDP_Per_Capita)) |>
  ggplot(mapping = aes(x = Absence_Corruption_Index, y = GDP_Per_Capita)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Absence of Corruption vs GDP per Capita(Log Transformed)\n 2021", x = "Absence of Corruption", y = "Log of GDP per Capita") +
  theme(axis.title.y = element_text(vjust = 0.5, angle = 0), plot.title = element_text(hjust = 0.5))

Looks like our orignal hypothesis seems to have at least some grounds. Lets see if such a trend has been present for the past 50 years(the extent in which we have data for absence of corruption index) or is this just a trend that matriculated recently.

History of Absence Of Corruption vs GDP per Capita

Code
full_data |>
  # Drops NA values cant be plotted...
  drop_na() |>
  ggplot(aes(x = Absence_Corruption_Index, y = GDP_Per_Capita, color = country)) +
  # Plots each country's data
  geom_point(alpha = 0.7, show.legend = FALSE) +
  # Drops the colour guide wouldn't be helpful for 172 countries...
  guides(color = "none") +
  # Seperates by region
  facet_wrap(~region) +
  # Makes a regression line for easier viewing
  geom_smooth(mapping = aes(x = Absence_Corruption_Index, y = GDP_Per_Capita, color = "blue"), method = "lm", se = FALSE) +
  # Tittle stuff
  labs(title = "Absence of Corruption vs GDP per Capita", subtitle = "Year: {frame_time}", x = "Absence of Corruption", y = "GDP er Capita") +
  theme(axis.title.y = element_text(vjust = 0.5, angle = 0), plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) +
  # Animating by year
  transition_time(as.integer(Year)) +
  ease_aes("linear")

The trend holds!, only a true regression though will tell if there really a correlation and if so how strong?

Data Modeling & Prediction

Creating our Regression Models

While the log transformation gives us a easier way to see the upwards trend between Absence of Corruption Index and the nation’s GDP per Captia, it actually does NOT help our linear model, in fact comparing the computed R-squared values is worse on the log transformed model.

Code
# 2021 Data

full_2021_data <- full_data |> filter(Year == 2021)

# 2021 Data with log transformation

log_2021_data <- full_2021_data |> mutate(GDP_Per_Capita = log(GDP_Per_Capita))
Code
full_2021_data_lm <- lm(GDP_Per_Capita ~ Absence_Corruption_Index, full_2021_data)
log_2021_data_lm <- lm(GDP_Per_Capita ~ Absence_Corruption_Index, log_2021_data)
tibble("R Squared Non Transformed" = summary(full_2021_data_lm)$r.squared, "R Squared Log Transformed" = summary(log_2021_data_lm)$r.squared) |>
  reactable() |>
  add_title("Comparison of R Squared of Two Linear Models", font_size = 20, align = "center")

Comparison of R Squared of Two Linear Models

With that in mind, we will be sticking with our untransformed data for the remainder of this exploration.

Code
# Removes the transformed log models
remove(log_2021_data, log_2021_data_lm)

# Summary of Linear Model :
broom::tidy(full_2021_data_lm) |>
  rename("Regression Variable" = term, "Model Estimate" = estimate, "Standard Error" = std.error) |>
  select(c("Regression Variable":"Standard Error"))
Regression Variable Model Estimate Standard Error
(Intercept) -199747.304 31994.4920
Absence_Corruption_Index 8211.695 623.4122

Our linear model comparing different countries in 2021 and their respective Absence of Corruption Index vrs their 2021 GDP Per Captia yeilds us the following results

\[ \beta_0 = -199747.304\\ \beta_1 = 8211.695 \] which will give us the linear regression formula of : \[ \hat{y} = -199747.304+8211.695x \]

or rewritten with our explanatory and response variables \[ \hat{GDP\_Per\_Captia} = -199747.304+8211.695(Absence\_Corruption\_Index) \]

Analysis

With a \(R^2\) value of 0.50510 approximately 50% of our observed GDP per Captia variability is accounted for in our correlation that for every score increase of 1 for a country’s Absence of Corruption Index their GDP per Captia will increase $8211.66.

Checking the Linear Model’s Accuracy

Although we have a mild \(R^2\) relation between Absence of Corruption Index and its response variable GDP per Captia, we want to still check the model fit. One way to check the average residual of our model

Code
full_2021_data_lm_resd <- full_2021_data_lm |>
  broom::augment() |>
  select(.resid)
full_2021_data_lm_resd |>
  summarize("Avergae Residual" = mean(.resid)) |>
  reactable() |>
  add_title("Model Residual Stats", font_size = 20, align = "center")

Model Residual Stats

With a average residual \(<|10^{-9}|\) this looks pretty promising, but to confirm our calculations and better visualize the distributions of residuals lets plot them on a histogram. What we expect is a normal distribution of all residuals reminiscent of the familiar bell curve.

Code
options(scipen = 100)
full_2021_data_lm_resd |> ggplot(mapping = aes(x = .resid)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50) +
  labs(x = "Residuals", y = "Percentage of Distrubution") +
  geom_density(colour = "steelblue") +
  theme(axis.title.y = element_text(angle = 0, vjust = 0.5))

Code
options(scipen = 0)

`

Simulations and Model

Code
observed_2021_data <- full_2021_data |>
  select(GDP_Per_Capita) |>
  mutate(GDP_Per_Capita = as.numeric(GDP_Per_Capita)) |>
  pull()
predicted_2021_data <- full_2021_data_lm |> predict()
Code
generateRealisticPredictions <- function(predictedValues, standardError) {
  mutatedPredictedValues <- predictedValues + rnorm(length(predictedValues), 0, standardError)
  return(mutatedPredictedValues)
}

findModelFit <- function(linearModel) {
  newPredicted <- linearModel |> broom::augment()
  newPredicted <- newPredicted |>
    select(GDP_Per_Capita, .fitted) |>
    mutate(.fitted = generateRealisticPredictions(.fitted, sigma(linearModel)))
  observedPredictedLm <- lm(GDP_Per_Capita ~ .fitted, newPredicted)
  return(broom::glance(observedPredictedLm) |> select(r.squared) |> pull())
}
Code
set.seed(05081995)
sim_model_res <- map_dbl(.x = c(1:5000), .f = ~ findModelFit(full_2021_data_lm))
summary(sim_model_res) |>
  unclass() |>
  as.list() |>
  as.data.frame() |>
  pivot_longer(cols = c("Min.":"Max."), names_to = "Stat", values_to = "Values") |>
  mutate(Stat = str_remove(Stat, "[X]")) |>
  mutate(Stat = str_replace(Stat, "\\.", " ")) |>
  reactable() |>
  add_title("Model Residual Simulations Stats", font_size = 20, align = "center")

Model Residual Simulations Stats

Plotting the \(R^2\) on a histogram we get

Code
options(scipen = 100)
sim_model_res |>
  as.data.frame() |>
  ggplot(mapping = aes(x = sim_model_res)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50) +
  ggtitle(bquote("Distrubution of " ~ R^2~"between Normalized Simulated Data and Observed Data")) +
  labs(x = bquote(R^2), y = "Percentage of\n Distrubution") +
  geom_density(colour = "steelblue") +
  theme(axis.title.y = element_text(angle = 0, vjust = 0.5), plot.title = element_text(hjust = 0.5))

References

Gapminder. 2023. “_IDEA-Democracy Indices Data - V5.” Gapminder; Gapminder. https://docs.google.com/spreadsheets/d/1jYUZFQOQrE0bAjV9XVgr_92nMT-_ukYBs4Uom4rfVtQ/edit#gid=501532268&range=B17.
Ingabire, Diane. 2020. “Gapminder Documentation 001 -GDP Per Capita, Constant PPP Dollars V26.” Gapminder.org; Gapminder Foundation. https://s3.eu-west-1.amazonaws.com/static.gapminder.org/GapminderMedia/wp-uploads/20210512195520/Documentation_-GDP-per-Capita-v26.pdf.